Paisley data set available here.
# Install packages (if needed)
install.packages("tidyverse")
install.packages("xlsx")
install.packages("writexl")
# Start afresh
rm(list=ls()) # Clear de "Global Environment"
# Load the packages you need
library(tidyverse)
library(readxl)
library(writexl)
# Working directory
getwd() # Provides the current working directory
# Helpful to see how the paths are defined (Windows/Mac differences)
setwd("/Volumes/francijb/Documents/FRAN/Teaching/QM Research School/Session") # Mac
setwd("W:/Documents/FRAN/Teaching/QM Research School/Session") # Windows
# RStudio menu helps finding the path:
# Session/Set working directory/Choose directory
# Copy and paste the chosen path from the console to the script
# Import data
paisley_data <- read_excel("/Volumes/francijb/Documents/FRAN/Teaching/QM Research School/Datasets/paisley_data.xls")
paisley_data <- read_excel("W:/Documents/FRAN/Teaching/QM Research School/Datasets/paisley_data.xls")
# if the data is in the working directory, you don't need the path (just the file name)
# Inspecting the data
paisley_data # A peek
# It also indicates whether categorical (character) or numerical (double...)
view(paisley_data) # The whole dataset
#################### DESCRIPTIVE STATISTICS (1 variable) ###################
############## Categorical (qualitative) variables ############
# sex literacy employed ...
# Frequency table (tabulate)
# 1 variable:
# number of males and females
table(paisley_data$sex)
# number by country of birth
table(paisley_data$countryb)
# express it as a proportion (instead of a count)
prop.table(table(paisley_data$sex))
# or
paisley_data %>%
group_by(sex) %>% # dimension we focus on
summarize(n = n()) %>% # reporting number of observations in each group
mutate(freq = n/sum(n)) # creating a new variable with the relative frequency
# with literacy
paisley_data %>%
group_by(literacy) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
# focusing on a subsample
paisley_data %>%
filter(sex == "male") %>% # "filter" restricts the analysis to those
group_by(literacy) %>% # observations fulfilling that condition
summarize(n = n()) %>%
mutate(freq = n/sum(n))
### Create an output that you can use later
table1 <- paisley_data %>%
filter(sex == "male") %>%
group_by(literacy) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
view(table1)
# Export the output (object) into excel
write_xlsx(table1, path = "table1.xlsx")
###### Plotting frequencies (graph bar)
# use "+" to add features to the graph
# sex
ggplot(data = paisley_data) +
geom_bar(mapping = aes(x = sex))
# or
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = sex))
# literacy
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = literacy))
# horizontal to facilitate reading the categories
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = literacy)) +
coord_flip()
# proportions (instead of counts)
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = literacy, y = stat(prop), group = 1)) +
coord_flip()
# editing the graph
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = literacy, y = stat(prop), group = 1)) +
coord_flip() + ylab("Proportion") + xlab("Literacy")
# or all together:
# labs(title ="", subtitle = "", x = "", etc etc)
# many options for editing graphs...
# save the graph
ggsave("lit.png", dpi = 320)
#### Recategorising
# Creating a new variable and recoding new values
# i.e. simplify literacy so there are not so many categories
paisley_data %>%
group_by(literacy) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
paisley_data <- mutate(paisley_data, lit_adj = recode(literacy,
"superior education" = "write",
"read well" = "read",
"read tolerably" = "read",
"read a little" = "read",
"read & write well" = "write",
"read & write tolerably" = "write",
"read & write a little" = "read",
"cannot write" = "read")
)
# "mutate" creates a new variable based on an existing one (literacy)
# "recode" makes the adjustments we ask for
# note that there is no need to "illiterate" = "illiterate"
# (because it is already in the original variable)
# "paisley_data <-" tells R to alter the dataset
# Check the outcome (always)
View(paisley_data)
table(paisley_data$lit_adj)
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = lit_adj)) +
coord_flip()
#### Ranking (ordering)
# Assign order to qualitative variables
# The computer does not distinguish between categorical values
# Note that they are presented in alphabetical order
paisley_data %>%
group_by(literacy) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
paisley_data$lit_adj2 <- factor(paisley_data$literacy,
levels = c("illiterate", "read a little", "read tolerably", "read well",
"cannot write", "read & write a little", "read & write tolerably",
"read & write well", "superior education"),
ordered = TRUE)
# the ranking is given by the order of categories listed in "levels" (in ascending order)
# notice that we have created a new variable
paisley_data$lit_adj2
# although the values have not changed, they have been assigned a ranking: see "Levels" in the last line of the results
# checking the result
paisley_data %>%
ggplot() +
geom_bar(mapping = aes(x = lit_adj2)) +
coord_flip()
# excluding now the missing values (NA): "!is.na(lit_adj)"
paisley_data %>%
filter(!is.na(lit_adj2)) %>%
ggplot() +
geom_bar(mapping = aes(x = lit_adj2)) +
coord_flip()
# Ranking can be also obtained by assigning numerical values directly:
paisley_data <- mutate(paisley_data, lit_adj3 = recode(literacy,
"superior education" = 8,
"read & write well" = 7,
"read & write tolerably" = 6,
"read & write a little" = 5,
"cannot write" = 4,
"read well" = 3,
"read tolerably" = 2,
"read a little" = 1,
"illiterate" = 0)
)
view(paisley_data)
paisley_data %>%
group_by(lit_adj3) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
ggplot(data = filter(paisley_data, !is.na(lit_adj3))) + # excluding the missing values (NA)
geom_bar(mapping = aes(x = lit_adj2))
### Categorise more complicated patterns
# i.e. the variable "marks" provides many "messy" categories; how to make sense of it
# more on this with Gregory
paisley_data %>%
group_by(marks) %>%
summarize(n = n()) %>%
print(n = Inf) # display all rows
install.packages("stringr")
library(stringr)
# identify the prisoners where the term "scar" is mentioned in "marks"
paisley_data <- paisley_data %>%
mutate(scar = ifelse(str_detect(marks, "scar"), 1, 0))
# new variable named "cut" with value of 1 when R detects the term "cut" (and 0 otherwise)
view(paisley_data)
paisley_data %>%
subset(select=c(marks, scar)) %>%
print(n = 100)
# or alternatively:
# first create a new variable
paisley_data <- paisley_data %>%
mutate(marks_adj = marks)
# keep replacing values
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "scar")] <- "scar"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "cut")] <- "cut"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "mark")] <- "mark"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "no mark")] <- "none"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "nomark")] <- "none"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "blind")] <- "blind"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "lame")] <- "lame"
paisley_data %>%
subset(select=c(marks, marks_adj)) %>%
print(n = 100)
paisley_data %>%
group_by(marks_adj) %>%
summarize(n = n()) %>%
print(n = Inf) # display all rows
############## Numerical variables ############
# age, weight, height, ...
paisley_data
# Reporting frequencies is not useful when there are many values
paisley_data %>%
group_by(age) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n)) %>%
print(n = Inf) # display all rows
# Solution: Create class intervals (group values into bins)
paisley_data <- paisley_data %>%
mutate(age_class = cut(age, breaks = 5)) # Creates groups of equal size
# or setting when the breaks happen yourself
paisley_data <- paisley_data %>%
mutate(age_class = cut(age, breaks = c(0, 14, 19, 50, Inf)))
# or even assign "labels" to those groups
paisley_data <- paisley_data %>%
mutate(age_class = cut(age, breaks = c(0, 14, 19, 50, Inf),
labels = c("Children", "Youngters", "Adults", "Elderly")))
# check
paisley_data %>%
group_by(age_class) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n)) %>%
print(n = Inf)
## Summarise:
# count: n(), mean, median, min, max, sd, IQR, min, quantile(x, 0.25)...
# first, nth(x, 2), last
# !is.na(x), n_distinct(x)
# average age
summarise(paisley_data, age = mean(age, na.rm = TRUE))
# na.rm removes missing values from the computations
# instead of yielding missing outputs (check removing that condition)
# or
paisley_data %>%
summarize(count = sum(!is.na(age)), # "!is.na" indicates "is not na" (not available=missing value)
mean_age = mean(age, na.rm = TRUE), # the comma allows for asking for more statistics
min_age = min(age, na.rm = TRUE),
max_age = max(age, na.rm = TRUE),
p25_age = quantile(age, 0.25, na.rm = TRUE),
p75_age = quantile(age, 0.75, na.rm = TRUE)
)
## or
summary(paisley_data$age, na.rm = T)
# Summarise by group(s):
paisley_data %>%
group_by(sex) %>%
summarize(count = sum(!is.na(age)),
mean_age = mean(age, na.rm = TRUE),
min_age = min(age, na.rm = TRUE),
max_age = max(age, na.rm = TRUE)
)
# by several groups
paisley_data %>%
group_by(sex, countryb) %>%
summarize(count = sum(!is.na(age)),
mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE)
)
# store the results in a new object
by_sex_countryb <- paisley_data %>%
group_by(sex, countryb) %>%
summarize(count = sum(!is.na(age)),
mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE)
)
view(by_sex_countryb)
# Export to excel (note that is appended as a differet sheet in the previous file)
write_xlsx(by_sex_countryb, path = "table2.xlsx")
## Histogram (visual representation of the distribution)
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = age), binwidth = 5)
# Play around with the width of the bin: accuracy vs noise
ggsave("hist_age.pdf") # save plot
# Focusing on particular subsamples
paisley_data %>%
filter(sex == "male") %>%
ggplot() +
geom_histogram(mapping = aes(x = age), binwidth = 5)
# Or by group
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = age), binwidth = 5) +
facet_wrap(~ sex, nrow = 1)
## Outlier / recode value
summary(paisley_data$age, na.rm = T)
subset(paisley_data, age==160)
# or
outlier <- subset(paisley_data, age==160)
view(outlier)
# if we check the source, it was a typo, real age 16, so we correct it:
paisley_data$age[paisley_data$age == 160] <- 16
# check the result
subset(paisley_data, forename=="FRANCIS" & surname=="GILFILLAN", select=c(forename, surname, age))
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = age), binwidth = 5) +
facet_wrap(~ sex, nrow = 1)
# Overlapping histograms
paisley_data %>%
ggplot(mapping = aes(x = age, colour = sex)) +
geom_freqpoly(binwidth = 5)
# With relative frequency (instead of counts) so as to compare different-size groups
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = age, y = ..density..), binwidth = 5) +
facet_wrap(~ sex, nrow = 1)
##### Modifying / creating variables
# Inspect info on heights: feet, inches
view(paisley_data)
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = feet), binwidth = 1)
# something is going on
paisley_data %>%
summarize(count = sum(!is.na(feet)),
mean_feet = mean(feet, na.rm = TRUE),
min_feet = min(feet, na.rm = TRUE),
max_feet = max(feet, na.rm = TRUE)
)
subset(paisley_data, feet==50)
# correct outlier
paisley_data$feet[paisley_data$feet == 50] <- 5
# check you dit it right
subset(paisley_data, forename=="FRANCIS" & surname=="GILFILLAN", select=c(forename, surname, feet))
# or
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = feet), binwidth = 1)
# The variable "inches" looks ok
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = inches), binwidth = 1)
# Create nex variable (height) combining both variables (feet & inches)
paisley_data <- mutate(paisley_data,
height = 30.48*feet+2.54*inches)
view(paisley_data)
# a new variable has been added (last column on the right)
# visualise new variable
paisley_data %>%
ggplot() +
geom_histogram(mapping = aes(x = height), binwidth = 5)
# Group a numerical variable into different bins
paisley_data <- paisley_data %>%
mutate(height_bins = case_when(height < 145 ~ 'low',
height >=145 & height< 175 ~ 'med',
height >=175 ~ 'high'))
# check
paisley_data %>%
group_by(height_bins) %>%
summarize(n = n()) %>%
mutate(freq = n/sum(n))
##### Save the data ####
# The data has somewhat change: correct typos, add new variables...
# We can save the new dataset as a R file, so we can come back to that later
write_rds(paisley_data, "paisley_v2.rds")
paisley_data_1 <- read_rds("paisley_v2.rds")
#### Measures of dispersion: range, iqr, variance, standard deviation ####
# Distributions differ not only in their means (medians...) but also in
# how dispersed their values are
paisley_data %>%
group_by(sex) %>%
summarize(count = sum(!is.na(age)),
mean_age = mean(age, na.rm = TRUE),
min_age = min(age, na.rm = TRUE),
max_age = max(age, na.rm = TRUE),
range_age = max(age, na.rm = TRUE) - min(age, na.rm = TRUE), # range: max - min
iqr_age = IQR(age, na.rm = TRUE), # interquartile range: p75 - p25
var_age = var(age, na.rm = TRUE), # variance
sd_age = sd(age, na.rm = TRUE), # standard deviation
cv_age = sd(age, na.rm = TRUE)/mean(age, na.rm = TRUE) # coefficient of variation
)
paisley_data %>%
filter(age >= 18) %>%
group_by(sex) %>%
summarize(count = sum(!is.na(height)),
mean = mean(height, na.rm = TRUE),
min = min(height, na.rm = TRUE),
max = max(height, na.rm = TRUE),
range = max(height, na.rm = TRUE) - min(height, na.rm = TRUE), # range: max - min
iqr = IQR(height, na.rm = TRUE), # interquartile range: p75 - p25
var = var(height, na.rm = TRUE), # variance
sd = sd(height, na.rm = TRUE), # standard deviation
cv = sd(height, na.rm = TRUE)/mean(height, na.rm = TRUE) # coefficient of variation
)
paisley_data %>%
ggplot() +
ggplot(data = paisley_data, mapping = aes(x = age, colour = sex)) +
geom_freqpoly(binwidth = 5)
# Visualising
paisley_data %>%
filter(age >= 18) %>%
ggplot() +
geom_histogram(mapping = aes(x = height), binwidth = 2) +
facet_wrap(~ sex, nrow = 1)
2022.